import json
import os
import sys

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.offline as pyo
import requests
from bs4 import BeautifulSoup
sys.tracebacklimit = 0 # suppress traceback frames so notebook errors show only the exception message
Problem 1 - Data Extraction¶
A: Get a BeautifulSoup object of half-marathons within 200 miles of Virginia Beach
B: Build a function to scrape date, race_title, city, and race distances from the website
C: Build For loop to turn scraping function into a spider for races further into the future
D: Save the data from each loop as a df and concatenate the pages into one DataFrame
# Part 1A: fetch page 1 of the Running in the USA listing and parse it.
# First ask httpbin to echo back this client's default User-Agent string,
# then reuse it (plus a contact address) in the request headers.
ua_probe = requests.get('https://httpbin.org/user-agent')
useragent = json.loads(ua_probe.text)['user-agent']
headers = {
    'User-Agent': useragent,
    'from': 'vrd9sd@virginia.edu',
}
url = 'https://runningintheusa.com/classic/list/within-200-miles-of-virginia%20beach-va/upcoming/half-marathon/miles-between-250/page-1'
r = requests.get(url, headers=headers)
# Parse the raw HTML into a navigable soup object
mysoup = BeautifulSoup(r.text, 'html.parser')
# part 1B: function to scrape races from the running in the usa website
def race_df(url):
    """Scrape one listing page and return a DataFrame of race info.

    Args:
        url: (str) URL of one page of the Running in the USA race listing

    Returns:
        races: (pd.DataFrame) with columns date, race, city, distance —
            or the int -1 when the scraped column lengths disagree
            (a signal that the page layout changed or a record was missed)
    """
    r = requests.get(url, headers=headers)
    mysoup = BeautifulSoup(r.text, 'html.parser')
    # Race titles sit in bolded <td> cells; every odd-indexed match is a
    # title (the even ones are other bolded cells in the same table).
    # NOTE(review): these index offsets assume the site's current layout —
    # verify if the page markup changes.
    titles = [x.b for x in mysoup.find_all('td', attrs = {'style': 'text-decoration:inherit;'})]
    titles = [x.string for i, x in enumerate(titles) if i%2==1]
    # City names appear in <td rowspan="1|2"> cells; the first two matches
    # are header/featured cells, and each race contributes 4 such cells,
    # so keep every 4th.
    rowspan_1_2_cities = [x.b for x in mysoup.find_all('td', attrs={'rowspan':['1', '2']})]
    cities = rowspan_1_2_cities[2:]
    cities = [x.string for i,x in enumerate(cities) if i%4==0]
    # Dates are bold <div>s; skip the first two (page chrome) and keep one
    # per city found above.
    dates = [x.string for x in mysoup.find_all('div', attrs = {'style':"font-weight:bold"})]
    dates = [x for i,x in enumerate(dates) if i>1 and i<= (len(cities) + 1)]
    # Distance strings end in 'run' or 'relay' (e.g. "13.1M, 5K run")
    distances = [x.string for x in mysoup.find_all('div', attrs={'style':"padding-left:10px"}) if x.string and (x.string.endswith('run') or x.string.endswith('relay'))]
    if len(dates) == len(titles) == len(cities) ==len(distances): # ensure one of each field per record (lengths must match)
        # Part 1D: build the races df; slice off the first two rows, which
        # are the "featured" listings pinned to the top of every page
        races = pd.DataFrame({
            'date' : dates[2:],
            'race': titles[2:],
            'city': cities[2:],
            'distance': distances[2:]})
    else:
        races = -1  # sentinel: caller's pd.concat will fail loudly on it
    return races
# Scrape page 1, then crawl pages 2-6 and append each page's rows.
new_df = race_df(url)
# Part 1C: turn the single-page scraper into a spider.
page_base = url[:-1]  # url ends in "...page-1"; drop the trailing digit
for page_num in range(2, 7):
    url = page_base + str(page_num)
    new_df = pd.concat([new_df, race_df(url)], ignore_index=True)
new_df
| date | race | city | distance | |
|---|---|---|---|---|
| 0 | Dec 14, 2024 | Pocahontas Half | Chesterfield, VA | 13.1M, 10K, 5K run |
| 1 | Dec 15, 2024 | Holiday Half Marathon | Annandale, VA | 13.1M, 4M run |
| 2 | Dec 21, 2024 | Naptown Half Marathon | Annapolis, MD | 13.1M, 10K, 5K run |
| 3 | Dec 21, 2024 | Oakwood 24 | Raleigh, NC | 24H, 12H, 50M, 50K, 26.2M, 13.1M, 5K run |
| 4 | Dec 21, 2024 | Rudolph's Race 5K & Half | Washington, DC | 13.1M, 5K run |
| ... | ... | ... | ... | ... |
| 115 | Apr 26, 2025 | Spring Third Winchester Battlefield 5K, 10K, &... | Winchester, VA | 13.1M, 10K, 5K trail run |
| 116 | Apr 26, 2025 | Wake Forest Historic Half | Wake Forest, NC | 13.1M run |
| 117 | Apr 27, 2025 | Blackbeard's Half Marathon | Ocracoke, NC | 13.1M run |
| 118 | Apr 27, 2025 | Runners Half Marathon of Reston | Reston, VA | 13.1M, 5K run |
| 119 | May 3, 2025 | Neuse River Bridge Run | New Bern, NC | 13.1M, 10K, 5K run |
120 rows × 4 columns
Problem 2 - Data cleaning¶
A: Convert the race dates to datetime objects
B: Add a day of the week column for races
C: Order the Columns in Race DF to be aesthetically pleasing
D: Save the race data to a csv
# Part 2A: parse the scraped date strings (e.g. "Dec 14, 2024") to datetime64
new_df['date'] = pd.to_datetime(new_df.date)
# Part 2B: derive the weekday name from each race date
new_df['day'] = new_df['date'].dt.day_name()
# Part 2C: reorder columns for readability
new_df = new_df[['day', 'date', 'race', 'city', 'distance']]
# Part 2D: write the cleaned data once; skip if the csv already exists so a
# re-run doesn't clobber the saved snapshot.
# (os is imported at the top of the file, not mid-script.)
csv_file_path = 'half_marathon_data.csv'
if not os.path.exists(csv_file_path):
    new_df.to_csv(csv_file_path, index=False)
Problem 3 - EDA¶
A: Show how many races featuring a half marathon each city within 200 miles of va beach has on running the USA site
B: Make a table of all the races
C: Show how many races occur on each date
D: Identify the races that don't occur on Saturdays or Sundays
# Problem 3A: count races per city, most races first.
# Fix: the original grouped by 'cities' and aggregated 'races', but the
# DataFrame's columns are 'city' and 'race', so it raised a KeyError.
# Named aggregation produces the intended 'races' count column directly.
city_race_count = (
    new_df.groupby('city')
          .agg(races=('race', 'count'))
          .sort_values(by='races', ascending=False)
          .reset_index()
)
city_race_count.head()
| city | races | |
|---|---|---|
| 0 | Washington, DC | 10 |
| 1 | Colonial Heights, VA | 7 |
| 2 | Williamsburg, VA | 5 |
| 3 | Raleigh, NC | 5 |
| 4 | Cary, NC | 3 |
# Problem 3B: render the full race table as a plotly figure-factory table
table = ff.create_table(new_df)
# table
# Problem 3C: tally how many races fall on each calendar date
date_race_count = (
    new_df.groupby('date')
          .agg({'race': 'count'})
          .reset_index()
)
date_race_count.head()
| date | race | |
|---|---|---|
| 0 | 2024-10-05 | 6 |
| 1 | 2024-10-06 | 4 |
| 2 | 2024-10-12 | 5 |
| 3 | 2024-10-13 | 1 |
| 4 | 2024-10-19 | 7 |
# Part 3D: list the races that fall on a weekday rather than the weekend
weekend_days = ['Saturday', 'Sunday']
new_df[~new_df['day'].isin(weekend_days)]
| day | date | race | city | distance | |
|---|---|---|---|---|---|
| 52 | Monday | 2024-11-11 | Service and Sacrifice 5K & Half | Washington, DC | 13.1M, 5K run |
| 63 | Thursday | 2024-11-28 | Pie Gobbler 1M, 5K, 10K, 15K, and Half Marathon | Williamsburg, VA | 13.1M, 15K, 10K, 5K, 1M run |
| 64 | Thursday | 2024-11-28 | Skinny Turkey Half Marathon | Raleigh, NC | 13.1M, 10K, 5K run | kids run |
| 87 | Wednesday | 2025-01-01 | New Year Day 1M, 5K, 10K, 15K, and Half Marathon | Williamsburg, VA | 13.1M, 15K, 10K, 5K, 1M run |
Problem 4 - Data Visualization¶
A: Create a table of months of year, counts of races and day
B: Create an interactive barplot that shows the counts of half-marathons by day as well as month they occur in
C: Create line chart of races containing a half marathon by month
# Part 4A: table of race counts by weekday and month.
# Fix: the original referenced an undefined `date_df` and a nonexistent
# 'month' column (NameError). Derive the month number from the parsed
# dates in new_df, matching the day/month/race output shown below.
date_count = (
    new_df.assign(month=new_df['date'].dt.month)
          .groupby(['day', 'month'])['race']
          .count()
          .reset_index()
)
date_count.head()
| day | month | race | |
|---|---|---|---|
| 0 | Monday | 11 | 1 |
| 1 | Saturday | 1 | 4 |
| 2 | Saturday | 2 | 3 |
| 3 | Saturday | 3 | 4 |
| 4 | Saturday | 4 | 8 |
# Part 4B: interactive bar chart of half-marathon counts per day of week.
# (plotly.offline is imported at the top of the file, not mid-script.)
pyo.init_notebook_mode()  # enable inline plotly rendering in the notebook
day_count = new_df['day'].value_counts().reset_index()
day_count.columns = ['day', 'count']
fig = px.bar(day_count, x='day', y='count',
             text='count',
             title='number of races w/ half marathons within 200 miles of Virginia Beach categorized by day',
             color='day',
             labels={'count': 'number of races'})
fig.show()
# Part 4C: line chart of races containing a half marathon, by month.
# Bucket every race into its calendar month (Period), count per bucket,
# then convert back to timestamps so plotly gets a real datetime axis.
new_df['year_month'] = new_df['date'].dt.to_period('M')
per_month = (
    new_df.groupby('year_month')
          .size()
          .reset_index(name='count')
)
per_month['year_month'] = per_month['year_month'].dt.to_timestamp()
fig = px.line(per_month, x='year_month', y='count',
              title='Number of Races with a half-marathon within 200 miles of VB by Month',
              labels={'year_month': 'Month', 'count': 'Count of Races'})
fig.show()
# --- Unfinished sketch: geocode each city to map race locations ---
# from geopy.geocoders import Nominatim
# geolocator = Nominatim(user_agent="myGeocoder for student project")
# lat = geolocator.geocode("Cary, NC")
'Washington'  # leftover bare expression from an earlier experiment; has no effect
# NOTE(review): px.scatter_geo's `locations` expects location codes (or
# lat/lon columns), not free-text city names — the call below likely would
# not plot as-is; confirm before un-commenting.
# fig = px.scatter_geo(new_df, locations="city", hover_name="race", title="Races by Location")
# fig.show()